In [1]:
# !pip install spacy
# !python -m spacy.en.download
In [5]:
from IPython.display import SVG, display
import spacy
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
In [3]:
# encode some text as unicode
text = u"I'm executing this code on an Apple Computer."

# instantiate a language model
# to download the language model: python -m spacy.en.download
nlp = spacy.load('en')  # or spacy.en.English()

# create a document
document = nlp(text)
In [4]:
# inspect the processing steps in the default pipeline
for function in nlp.pipeline:
    print function
### Modifying the Language Model
In [ ]:
# a custom pipeline step: tag every occurrence of 'starwars' as a proper noun
def identify_starwars(doc):
    for token in doc:
        if token.text == u'starwars':
            token.tag_ = u'NNP'

def return_pipeline(nlp):
    return [nlp.tagger, nlp.parser, nlp.matcher, nlp.entity, identify_starwars]

text = u"I loved all of the starwars movies"
custom_nlp = spacy.load('en', create_pipeline=return_pipeline)
new_document = custom_nlp(text)

for function in custom_nlp.pipeline:
    print function
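To confirm the custom step actually fired, we can print each token's tag; 'starwars' should now carry NNP (a quick sanity-check sketch, not part of the original pipeline):
In [ ]:
# verify that identify_starwars retagged 'starwars' as a proper noun
for token in new_document:
    print token.text, token.tag_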
### Deploying Model on Many Texts with .pipe
In [ ]:
# process many documents in parallel; .pipe returns a generator
texts = [u'You have brains in your head.'] * 10000
for doc in nlp.pipe(texts, n_threads=4):
    doc.is_parsed
In [19]:
runtimes = {}

for thread_count in [1, 2, 3, 4, 8]:
    t0 = datetime.now()

    # create a generator of processed documents
    processed_documents = nlp.pipe(texts, n_threads=thread_count)

    # the pipeline only runs once we consume the generator
    for doc in processed_documents:
        doc.is_parsed

    t1 = datetime.now()
    runtimes[thread_count] = (t1 - t0).total_seconds()

ax = pd.Series(runtimes).plot(kind='bar')
ax.set_ylabel("Runtime (Seconds) with N Threads")
plt.show()
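Thread count is not the only knob: `.pipe` also takes a `batch_size` argument controlling how many texts are buffered per batch. A hedged sketch with arbitrary batch sizes:
In [ ]:
# sketch: time different batch sizes at a fixed thread count
# (the batch_size values here are arbitrary examples)
batch_runtimes = {}
for batch_size in [100, 1000, 10000]:
    t0 = datetime.now()
    for doc in nlp.pipe(texts, n_threads=4, batch_size=batch_size):
        doc.is_parsed
    batch_runtimes[batch_size] = (datetime.now() - t0).total_seconds()
print batch_runtimes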
In [18]:
# compare the types of a token, a span, and a document
def info(obj):
    return {'type': type(obj), '__str__': str(obj)}

text = u"""spaCy excels at large-scale information extraction tasks.
It's written from the ground up in carefully memory-managed Cython."""
document = nlp(text)
token = document[0]
span = document[0:3]

pd.DataFrame(map(info, [token, span, document]))
Out[18]:
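Note that spans and tokens are lightweight views into the parent document rather than copies, so slicing is cheap; a small sketch:
In [ ]:
# spans are views into the parent document, not copies
print span.text
print [t.text for t in span]
print len(span), len(document)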
In [6]:
# document.sents is a generator of sentence spans
print document.sents
for sent in document.sents:
    print sent
In [7]:
for token in document:
    print token
In [22]:
token = document[13]
print "text: %s" % token.text
print "suffix: %s" % token.suffix_
print "lemma: %s" % token.lemma_
In [8]:
# part-of-speech and dependency tagging
attrs = map(lambda token: {
    "token": token,
    "part of speech": token.pos_,
    "dependency": token.dep_,
}, document)
pd.DataFrame(attrs)
Out[8]:
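The dependency labels become more useful once you walk the tree: each token knows its syntactic head. A short sketch:
In [ ]:
# walk the dependency tree: every token points to its syntactic head
for token in document:
    print "%s <-%s- %s" % (token.text, token.dep_, token.head.text)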
In [9]:
print "noun chunks: {}".format(list(document.noun_chunks))
In [10]:
ents = [(ent, ent.root.ent_type_) for ent in document.ents]
print "entities: {}".format(ents)
In [27]:
# document, span, and token similarity
def plot_similarities(similarities, target):
    f, ax = plt.subplots(1)
    index = range(len(similarities))
    ax.barh(index, similarities)
    ax.set_yticks([i + .5 for i in index])
    ax.set_yticklabels(document2)
    ax.grid()
    ax.set_title("Similarity to '{}'".format(target))
    plt.show()

computer = nlp(u'computer')
document2 = nlp(u'You might be using a machine running Windows')
similarities = map(lambda token: token.similarity(computer), document2)

plot_similarities(similarities, computer)
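Similarity is defined on documents and spans as well as tokens; spaCy averages the word vectors of the tokens involved. A sketch comparing whole texts (this assumes the loaded 'en' model ships with word vectors):
In [ ]:
# similarity also works at the document and span level
apples = nlp(u'I like apples')
oranges = nlp(u'I like oranges')
print apples.similarity(oranges)
print apples[2:3].similarity(oranges[2:3])  # span-level comparison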